Risk Deep Dive¶

Purpose: Detailed risk analysis for data science / actuarial review

Run via: make run-notebooks MODE=nyc or make all-both

In [1]:
# ============================================================
# SETUP
# ============================================================
import os
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display, Markdown

# Configuration from Makefile
RUN_DIR = os.environ.get("CITIBIKE_RUN_DIR")
mode = os.environ.get("CITIBIKE_MODE", "unknown")

if not RUN_DIR:
    raise ValueError("CITIBIKE_RUN_DIR not set.\nRun via Makefile: make run-notebooks MODE=nyc")

RUN_DIR = Path(RUN_DIR)
run_label = RUN_DIR.name

print(f"Mode: {mode}")
print(f"Run directory: {RUN_DIR}")
print(f"Run label: {run_label}")

# Load CSVs
df_year = pd.read_csv(RUN_DIR / "citibike_trips_by_year.csv")
df_month = pd.read_csv(RUN_DIR / "citibike_trips_by_month.csv")

# Scorecard (try 500m first)
scorecard_path = RUN_DIR / "axa_partner_scorecard_500m.csv"
if not scorecard_path.exists():
    for p in RUN_DIR.glob("axa_partner_scorecard_*m.csv"):
        scorecard_path = p
        break
df_score = pd.read_csv(scorecard_path) if scorecard_path.exists() else None

print(f"\nLoaded:")
print(f"  df_year:  {len(df_year)} rows")
print(f"  df_month: {len(df_month)} rows")
print(f"  df_score: {len(df_score) if df_score is not None else 'None'} rows")

if df_score is not None:
    print(f"\nScorecard columns: {list(df_score.columns)}")
    if "year" in df_score.columns:
        print(f"Years: {sorted(df_score['year'].dropna().unique())}")
    if "month" in df_score.columns:
        print(f"Months: {sorted(df_score['month'].dropna().unique())}")

# Figure output
FIG_DIR = RUN_DIR.parent.parent / "reports" / run_label / "figures"
FIG_DIR.mkdir(parents=True, exist_ok=True)


def savefig(name):
    path = FIG_DIR / name
    plt.savefig(path, dpi=150, bbox_inches="tight")
    print(f"Saved: {path}")
Mode: nyc
Run directory: /home/hamzeh-khanpour/Desktop/CITIBIK-main/summaries/y2019_2020_2023_2025_m1_2_3_4_5_6_7_8_9_10_11_12_modenyc
Run label: y2019_2020_2023_2025_m1_2_3_4_5_6_7_8_9_10_11_12_modenyc
Loaded:
  df_year:  4 rows
  df_month: 48 rows
  df_score: 72466 rows

Scorecard columns: ['mode', 'year', 'month', 'start_station_id', 'start_station_name', 'station_lat', 'station_lng', 'exposure_trips', 'crash_count', 'risk_rate_per_100k_trips', 'risk_rate_ci_low', 'risk_rate_ci_high', 'risk_proxy_available', 'credibility_flag', 'exposure_pct', 'risk_pct', 'axa_priority_score', 'prevention_hotspot', 'product_hotspot', 'acquisition_hotspot', 'exposure_index_pct', 'eb_risk_rate_per_100k_trips', 'risk_index_pct', 'expected_incidents_proxy', 'scoring_strategy', 'eb_m_prior_used']
Years: [np.int64(2019), np.int64(2020), np.int64(2023), np.int64(2025)]
Months: [np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(7), np.int64(8), np.int64(9), np.int64(10), np.int64(11), np.int64(12)]
In [2]:
# --- HIGH-RISK STATIONS ---
MIN_TRIPS = 5000
TOP_N = 10
HIGH_RISK_PCT = 90

if df_score is None or len(df_score) == 0:
    print("No scoring data found (df_score is missing/empty).")
else:
    print("\n" + "=" * 70)
    print("HIGH-RISK STATIONS ANALYSIS")
    print("=" * 70)

    s = df_score.copy()

    # Normalize column names
    s = s.rename(
        columns={
            "start_station_id": "station_id",
            "start_station_name": "station_name",
            "exposure_trips": "trips",
        },
        errors="ignore",
    )

    # Identify columns
    id_col = "station_id" if "station_id" in s.columns else None
    name_col = "station_name" if "station_name" in s.columns else None
    rate_col = (
        "eb_risk_rate_per_100k_trips"
        if "eb_risk_rate_per_100k_trips" in s.columns
        else "risk_rate_per_100k_trips"
    )

    has_year = "year" in s.columns
    has_month = "month" in s.columns

    # Clean numeric columns
    s["trips"] = pd.to_numeric(s.get("trips", 0), errors="coerce")
    s[rate_col] = pd.to_numeric(s[rate_col], errors="coerce")
    if "crash_count" in s.columns:
        s["crash_count"] = pd.to_numeric(s["crash_count"], errors="coerce")
    if has_year:
        s["year"] = pd.to_numeric(s["year"], errors="coerce").astype("Int64")
    if has_month:
        s["month"] = pd.to_numeric(s["month"], errors="coerce").astype("Int64")

    # Filter to credible
    if "credibility_flag" in s.columns:
        credible = s[s["credibility_flag"] == "credible"].copy()
    else:
        credible = s[s["trips"] >= MIN_TRIPS].copy()
        print(f"Note: no credibility_flag column; using trips ≥ {MIN_TRIPS:,} as credible.")

    credible = credible.dropna(subset=["trips", rate_col])
    credible = credible[credible["trips"] >= MIN_TRIPS].copy()

    if len(credible) == 0:
        print("No credible rows with usable risk data.")
    else:
        # High risk threshold
        cutoff = float(np.nanpercentile(credible[rate_col].values, HIGH_RISK_PCT))
        high_risk = credible[credible[rate_col] >= cutoff].copy()

        print(
            f"High-risk threshold: {HIGH_RISK_PCT}th percentile of {rate_col} among credible rows"
        )
        print(f"Cutoff value: {cutoff:.2f}")
        print(f"High-risk rows: {len(high_risk):,}")
        print(f"Total exposure in high-risk rows: {high_risk['trips'].sum():,.0f} trips")

        # Table columns
        cols_to_show = [
            c
            for c in [
                "mode",
                "year" if has_year else None,
                "month" if has_month else None,
                id_col,
                name_col,
                "station_lat",
                "station_lng",
                "trips",
                "crash_count",
                rate_col,
            ]
            if c and c in high_risk.columns
        ]

        top_risk = high_risk.sort_values([rate_col, "trips"], ascending=False).head(TOP_N)[
            cols_to_show
        ]

        print(f"\nTop {TOP_N} Highest-Risk (credible) station-periods:")
        display(top_risk)

        # Scope info
        if has_year and has_month:
            scope = (
                s.dropna(subset=["year", "month"])
                .groupby(["year", "month"])
                .size()
                .reset_index(name="rows")
            )
            print("\nScorecard scope (rows by year-month):")
            print(scope.sort_values(["year", "month"]).to_string(index=False))

        # Plot
        top_plot = top_risk.sort_values(rate_col, ascending=True).copy()

        fig, ax = plt.subplots(figsize=(12, 6))
        y = np.arange(len(top_plot))
        vals = top_plot[rate_col].values

        ax.barh(y, vals, edgecolor="black")
        ax.set_yticks(y)
        ax.set_yticklabels([str(n)[:40] for n in top_plot[name_col]], fontsize=9)
        ax.set_xlabel("Risk rate per 100k trips", fontsize=11)
        ax.set_title(
            f"Top {TOP_N} Highest-Risk Station-Periods (Credible)", fontsize=12, fontweight="bold"
        )
        ax.grid(axis="x", alpha=0.3)

        # Right-side year/month columns
        x_max = float(np.nanmax(vals)) if len(vals) else 1.0
        pad = 0.18 * x_max
        ax.set_xlim(0, x_max + pad)

        x_sep = x_max + pad * 0.08
        x_year = x_max + pad * 0.35
        x_month = x_max + pad * 0.70

        ax.axvline(x_sep, color="gray", linewidth=1, alpha=0.6)

        if has_year and has_month:
            ax.text(
                x_year,
                len(y) - 0.15,
                "Year",
                ha="center",
                va="bottom",
                fontsize=10,
                fontweight="bold",
            )
            ax.text(
                x_month,
                len(y) - 0.15,
                "Month",
                ha="center",
                va="bottom",
                fontsize=10,
                fontweight="bold",
            )

            for i, (_, row) in enumerate(top_plot.iterrows()):
                yr_txt = "" if pd.isna(row.get("year")) else str(int(row["year"]))
                mo_txt = "" if pd.isna(row.get("month")) else str(int(row["month"])).zfill(2)
                ax.text(x_year, i, yr_txt, ha="center", va="center", fontsize=9)
                ax.text(x_month, i, mo_txt, ha="center", va="center", fontsize=9)

        plt.tight_layout()
        savefig("top10_highest_risk_station_periods.png")
        plt.show()
======================================================================
HIGH-RISK STATIONS ANALYSIS
======================================================================
High-risk threshold: 90th percentile of eb_risk_rate_per_100k_trips among credible rows
Cutoff value: 628.72
High-risk rows: 1,646
Total exposure in high-risk rows: 12,567,625 trips

Top 10 Highest-Risk (credible) station-periods:
mode year month station_id station_name station_lat station_lng trips crash_count eb_risk_rate_per_100k_trips
1938 nyc 2019 6 3734 E 58 St & 1 Ave (NW Corner) 40.759125 -73.962658 5008 116 2068.172197
308 nyc 2019 2 529.0 W 42 St & 8 Ave 40.757570 -73.990985 5098 109 2035.034813
1843 nyc 2019 5 3712 W 35 St & Dyer Ave 40.754692 -73.997402 5486 120 1999.473641
3297 nyc 2019 5 3749 Lexington Ave & E 36 St 40.747574 -73.978801 5108 112 1992.237399
213 nyc 2019 2 401.0 Allen St & Rivington St 40.720196 -73.989978 5270 109 1979.209296
802 nyc 2019 10 464 E 56 St & 3 Ave 40.759345 -73.967597 5731 122 1934.592498
42 nyc 2019 12 529 W 42 St & 8 Ave 40.757570 -73.990985 5672 112 1931.177105
609 nyc 2019 11 3236 W 42 St & Dyer Ave 40.758985 -73.993800 5081 106 1926.525165
496 nyc 2019 1 529.0 W 42 St & 8 Ave 40.757570 -73.990985 5319 105 1907.926813
690 nyc 2019 2 531.0 Forsyth St & Broome St 40.718939 -73.992663 5339 104 1878.788813
Scorecard scope (rows by year-month):
 year  month  rows
 2019      1   769
 2019      2   767
 2019      3   770
 2019      4   786
 2019      5   796
 2019      6   795
 2019      7   790
 2019      8   795
 2019      9   808
 2019     10   840
 2019     11   870
 2019     12   881
 2020      1   917
 2020      2   908
 2020      3   904
 2020      4   900
 2020      5   940
 2020      6   978
 2020      7  1008
 2020      8  1055
 2020      9  1106
 2020     10  1162
 2020     11  1166
 2020     12  1189
 2023      1  1817
 2023      2  1872
 2023      3  1898
 2023      4  1912
 2023      5  1910
 2023      6  1923
 2023      7  1964
 2023      8  2014
 2023      9  2060
 2023     10  2180
 2023     11  2103
 2023     12  2202
 2025      1  2252
 2025      2  2244
 2025      3  2247
 2025      4  2247
 2025      5  2245
 2025      6  2207
 2025      7  2206
 2025      8  2198
 2025      9  2213
 2025     10  2203
 2025     11  2222
 2025     12  2227
Saved: /home/hamzeh-khanpour/Desktop/CITIBIK-main/reports/y2019_2020_2023_2025_m1_2_3_4_5_6_7_8_9_10_11_12_modenyc/figures/top10_highest_risk_station_periods.png
No description has been provided for this image
In [3]:
# --- Risk vs Exposure: Multi-panel analysis ---
import math

if df_score is None or len(df_score) == 0:
    print("No scorecard data — skipping panel plots")
else:
    # Prepare data
    plot_df = df_score.copy()

    # Normalize columns
    plot_df = plot_df.rename(columns={"exposure_trips": "trips"}, errors="ignore")

    exposure_col = "trips"
    rate_col = "eb_risk_rate_per_100k_trips"
    crash_col = "crash_count" if "crash_count" in plot_df.columns else None

    # Filter to credible
    if "credibility_flag" in plot_df.columns:
        plot_df = plot_df[plot_df["credibility_flag"] == "credible"].copy()

    # Clean types
    plot_df[exposure_col] = pd.to_numeric(plot_df[exposure_col], errors="coerce")
    plot_df[rate_col] = pd.to_numeric(plot_df[rate_col], errors="coerce")
    if crash_col:
        plot_df[crash_col] = pd.to_numeric(plot_df[crash_col], errors="coerce").fillna(0)

    plot_df = plot_df.dropna(subset=[exposure_col, rate_col])
    plot_df = plot_df[plot_df[exposure_col] > 0].copy()

    # Check if we have any data left
    if len(plot_df) == 0:
        print(f"No credible data for mode={mode} — skipping panel plots")
    else:
        has_period_cols = "year" in plot_df.columns and "month" in plot_df.columns

        # Helper functions
        def draw_scatter(ax, df_slice):
            if crash_col:
                return ax.scatter(
                    df_slice[exposure_col],
                    df_slice[rate_col],
                    c=df_slice[crash_col],
                    alpha=0.6,
                    s=30,
                    vmin=vmin,
                    vmax=vmax,
                )
            else:
                ax.scatter(df_slice[exposure_col], df_slice[rate_col], alpha=0.6, s=30)
                return None

        def mark_hotspots(ax, df_slice):
            if "prevention_hotspot" in df_slice.columns:
                hotspots = df_slice[df_slice["prevention_hotspot"] == True]
                if len(hotspots) > 0:
                    ax.scatter(
                        hotspots[exposure_col],
                        hotspots[rate_col],
                        color="red",
                        s=130,
                        marker="*",
                        edgecolors="black",
                        linewidths=1,
                        label="Prevention Hotspots",
                        zorder=5,
                    )
                    ax.legend(loc="best", fontsize=7)

        def style_axes(ax, title):
            ax.set_xlabel("Exposure (trips)", fontsize=9)
            ax.set_ylabel("EB Risk Rate (per 100k trips)", fontsize=9)
            ax.set_title(title, fontsize=10, fontweight="bold")
            ax.set_xscale("log")
            ax.grid(alpha=0.3)

        # Global color scale
        vmin = vmax = None
        if crash_col and len(plot_df) > 0:
            vmin, vmax = float(plot_df[crash_col].min()), float(plot_df[crash_col].max())

        if not has_period_cols:
            # Overall plot only
            print(f"No year/month columns for mode={mode}. Making overall plot only.")

            fig, ax = plt.subplots(figsize=(7.2, 4.8))
            sc = draw_scatter(ax, plot_df)
            mark_hotspots(ax, plot_df)
            style_axes(ax, f"Risk vs Exposure (overall) — mode={mode}")

            if sc is not None:
                plt.colorbar(sc, ax=ax, label="Crash Count")

            savefig(f"risk_vs_exposure_overall_mode{mode}.png")
            plt.show()

        else:
            # Prepare year/month
            plot_df["year"] = pd.to_numeric(plot_df["year"], errors="coerce")
            plot_df["month"] = pd.to_numeric(plot_df["month"], errors="coerce")
            plot_df = plot_df.dropna(subset=["year", "month"])
            plot_df["year"] = plot_df["year"].astype(int)
            plot_df["month"] = plot_df["month"].astype(int)

            years_to_plot = sorted(plot_df["year"].unique())
            months_to_plot = sorted(plot_df["month"].unique())

            # Guard against empty data after filtering
            if not years_to_plot or not months_to_plot:
                print(f"No data after year/month filtering for mode={mode} — skipping panel plots")
            else:
                print(
                    f"[panel] mode={mode} years={years_to_plot} months={months_to_plot} rows={len(plot_df):,}"
                )

                # =========================================================
                # PANEL A: One subplot per (year, month) - paginated
                # =========================================================
                pairs = [(y, m) for y in years_to_plot for m in months_to_plot]
                total = len(pairs)

                PANELS_PER_PAGE = 12
                NCOLS = 3
                NROWS = int(math.ceil(PANELS_PER_PAGE / NCOLS))
                pages = int(math.ceil(total / PANELS_PER_PAGE))

                print(f"[panelA] Total panels={total} -> pages={pages}")

                for page_idx in range(pages):
                    start = page_idx * PANELS_PER_PAGE
                    page_pairs = pairs[start : start + PANELS_PER_PAGE]

                    figA, axesA = plt.subplots(
                        NROWS, NCOLS, figsize=(4.6 * NCOLS, 3.6 * NROWS), squeeze=False
                    )
                    scatter_for_cbar = None

                    for i, (y, m) in enumerate(page_pairs):
                        r, c = divmod(i, NCOLS)
                        ax = axesA[r][c]

                        d = plot_df[(plot_df["year"] == y) & (plot_df["month"] == m)]
                        sc = draw_scatter(ax, d)
                        mark_hotspots(ax, d)
                        style_axes(ax, f"{y}-{m:02d} (mode={mode})")

                        if scatter_for_cbar is None and sc is not None:
                            scatter_for_cbar = sc

                    # Turn off unused axes
                    for j in range(len(page_pairs), NROWS * NCOLS):
                        r, c = divmod(j, NCOLS)
                        axesA[r][c].axis("off")

                    figA.suptitle(
                        f"Risk vs Exposure — mode={mode} (page {page_idx + 1}/{pages})",
                        fontsize=13,
                        fontweight="bold",
                    )
                    figA.tight_layout(rect=[0.0, 0.0, 1.0, 0.93])

                    if scatter_for_cbar is not None:
                        import matplotlib as mpl

                        sm = mpl.cm.ScalarMappable(
                            norm=scatter_for_cbar.norm, cmap=scatter_for_cbar.cmap
                        )
                        sm.set_array([])
                        visible_axes = [
                            ax for ax in axesA.ravel() if ax.get_visible() and ax.has_data()
                        ]
                        cbar = figA.colorbar(sm, ax=visible_axes, fraction=0.035, pad=0.02)
                        cbar.set_label("Crash Count")

                    savefig(f"risk_vs_exposure_panel_periods_mode{mode}_page{page_idx + 1:02d}.png")
                    plt.show()

                # =========================================================
                # PANEL B: Yearly comparison (overlay years) - one subplot per month
                # =========================================================
                n = len(months_to_plot)
                ncols = min(3, n) if n > 1 else 1
                nrows = int(math.ceil(n / ncols))

                figB, axesB = plt.subplots(
                    nrows, ncols, figsize=(4.6 * ncols, 3.6 * nrows), squeeze=False
                )

                # Auto-select years to overlay (endpoints if too many)
                if len(years_to_plot) > 3:
                    years_overlay = [min(years_to_plot), max(years_to_plot)]
                    overlay_tag = f"endpoints_{years_overlay[0]}_{years_overlay[1]}"
                else:
                    years_overlay = years_to_plot
                    overlay_tag = "allYears"

                for i, m in enumerate(months_to_plot):
                    r, c = divmod(i, ncols)
                    ax = axesB[r][c]

                    any_data = False
                    for y in years_overlay:
                        d = plot_df[(plot_df["year"] == y) & (plot_df["month"] == m)]
                        if d.empty:
                            continue
                        any_data = True
                        ax.scatter(d[exposure_col], d[rate_col], alpha=0.5, s=22, label=str(y))
                        mark_hotspots(ax, d)

                    style_axes(ax, f"Year comparison — month {m:02d} (mode={mode})")
                    if any_data:
                        ax.legend(title="Year", fontsize=8, title_fontsize=9, loc="best")
                    else:
                        ax.text(
                            0.5, 0.5, "No data", transform=ax.transAxes, ha="center", va="center"
                        )
                        ax.set_axis_off()

                # Turn off unused axes
                for j in range(n, nrows * ncols):
                    r, c = divmod(j, ncols)
                    axesB[r][c].axis("off")

                figB.suptitle(
                    f"Yearly comparison (overlay: {overlay_tag}) — mode={mode}",
                    fontsize=13,
                    fontweight="bold",
                )
                figB.tight_layout(rect=[0.0, 0.0, 1.0, 0.93])

                savefig(f"risk_vs_exposure_panel_year_compare_mode{mode}_{overlay_tag}.png")
                plt.show()
[panel] mode=nyc years=[np.int64(2019), np.int64(2020), np.int64(2023), np.int64(2025)] months=[np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(7), np.int64(8), np.int64(9), np.int64(10), np.int64(11), np.int64(12)] rows=16,454
[panelA] Total panels=48 -> pages=4
Saved: /home/hamzeh-khanpour/Desktop/CITIBIK-main/reports/y2019_2020_2023_2025_m1_2_3_4_5_6_7_8_9_10_11_12_modenyc/figures/risk_vs_exposure_panel_periods_modenyc_page01.png
No description has been provided for this image
Saved: /home/hamzeh-khanpour/Desktop/CITIBIK-main/reports/y2019_2020_2023_2025_m1_2_3_4_5_6_7_8_9_10_11_12_modenyc/figures/risk_vs_exposure_panel_periods_modenyc_page02.png
No description has been provided for this image
Saved: /home/hamzeh-khanpour/Desktop/CITIBIK-main/reports/y2019_2020_2023_2025_m1_2_3_4_5_6_7_8_9_10_11_12_modenyc/figures/risk_vs_exposure_panel_periods_modenyc_page03.png
No description has been provided for this image
Saved: /home/hamzeh-khanpour/Desktop/CITIBIK-main/reports/y2019_2020_2023_2025_m1_2_3_4_5_6_7_8_9_10_11_12_modenyc/figures/risk_vs_exposure_panel_periods_modenyc_page04.png
No description has been provided for this image
Saved: /home/hamzeh-khanpour/Desktop/CITIBIK-main/reports/y2019_2020_2023_2025_m1_2_3_4_5_6_7_8_9_10_11_12_modenyc/figures/risk_vs_exposure_panel_year_compare_modenyc_endpoints_2019_2025.png
No description has been provided for this image
In [4]:
# --- Check if seasonality analysis is possible ---
can_do_seasonality = False

if df_score is None:
    print("=" * 80)
    print("  SEASONALITY ANALYSIS SKIPPED")
    print("=" * 80)
    print("No scorecard data available.")
elif "year" not in df_score.columns or "month" not in df_score.columns:
    print("=" * 80)
    print("  SEASONALITY ANALYSIS SKIPPED")
    print("=" * 80)
    print("No year/month columns in scorecard.")
else:
    if "credibility_flag" in df_score.columns:
        credible = df_score[df_score["credibility_flag"] == "credible"].copy()
    else:
        credible = df_score.copy()

    if len(credible) == 0:
        print("=" * 80)
        print("  SEASONALITY ANALYSIS SKIPPED")
        print("=" * 80)
        print(f"No credible data for {mode} (min_trips threshold not met).")
        print("This is expected for JC mode (smaller system).")
    else:
        can_do_seasonality = True
        print(f"✓ Seasonality analysis available: {len(credible):,} credible rows")
✓ Seasonality analysis available: 16,454 credible rows
In [5]:
# --- Seasonality: line chart (median risk by month, per year) ---
if not can_do_seasonality:
    print("Skipped")
else:
    risk_col = (
        "eb_risk_rate_per_100k_trips"
        if "eb_risk_rate_per_100k_trips" in credible.columns
        else "risk_rate_per_100k_trips"
    )

    # Aggregate: median + IQR per (year, month)
    summary = (
        credible.groupby(["year", "month"])[risk_col]
        .agg(median="median", q25=lambda x: x.quantile(0.25), q75=lambda x: x.quantile(0.75))
        .reset_index()
    )

    display(summary)

    years = sorted(summary["year"].unique())

    plt.figure(figsize=(10, 5))

    for year in years:
        sub = summary[summary["year"] == year].sort_values("month")
        plt.plot(sub["month"], sub["median"], marker="o", linewidth=2, label=str(int(year)))
        plt.fill_between(sub["month"], sub["q25"], sub["q75"], alpha=0.15)

    plt.title(
        f"Risk Seasonality (Median ± IQR) — mode={mode} ({run_label})",
        fontsize=14,
        fontweight="bold",
    )
    plt.xlabel("Month", fontsize=12)
    plt.ylabel("EB Risk Rate (per 100k trips)", fontsize=12)
    plt.xticks(range(1, 13))
    plt.legend(title="Year")
    plt.grid(True, alpha=0.3, linestyle="--")
    plt.tight_layout()

    savefig("03_seasonality_lines.png")
    plt.show()
year month median q25 q75
0 2019 1 931.674908 703.777840 1163.266958
1 2019 2 895.390758 702.786556 1062.695457
2 2019 3 795.401601 587.429084 1044.317599
3 2019 4 605.555216 417.475105 821.939478
4 2019 5 653.168411 435.096762 895.536100
5 2019 6 569.709310 387.226379 841.979261
6 2019 7 536.728441 356.427404 755.142102
7 2019 8 474.230785 336.443408 684.216414
8 2019 9 505.657358 358.239322 730.963851
9 2019 10 541.362887 380.519044 801.176152
10 2019 11 651.113335 494.546484 871.402656
11 2019 12 824.500635 678.058527 1110.545835
12 2020 1 599.547443 503.391432 762.983857
13 2020 2 673.577322 511.836083 868.001991
14 2020 3 489.893837 348.230712 609.022613
15 2020 4 114.238527 89.956749 159.469111
16 2020 5 144.061003 100.948964 192.333876
17 2020 6 159.051868 109.662597 224.807339
18 2020 7 203.061124 144.667290 272.919101
19 2020 8 200.027953 134.095910 295.082461
20 2020 9 197.555810 136.425498 284.274431
21 2020 10 237.762271 176.528517 319.912636
22 2020 11 243.871772 191.837125 332.153269
23 2020 12 329.758871 262.120264 410.082584
24 2023 1 323.056529 252.382544 389.650108
25 2023 2 315.262634 249.117389 377.028163
26 2023 3 310.508507 238.036592 397.504807
27 2023 4 270.760531 192.402170 347.861949
28 2023 5 252.071002 179.770216 352.331997
29 2023 6 255.286132 181.410071 348.992079
30 2023 7 226.964927 163.251120 303.210564
31 2023 8 222.354278 160.865127 301.553168
32 2023 9 251.951900 172.147280 332.124428
33 2023 10 258.111241 185.157357 355.594613
34 2023 11 264.715850 202.572098 357.700757
35 2023 12 343.304628 255.790919 432.743387
36 2025 1 268.644577 201.293836 339.327696
37 2025 2 243.815682 189.264557 321.634549
38 2025 3 227.859349 165.105564 308.368454
39 2025 4 213.642906 152.560479 278.695432
40 2025 5 229.218154 159.952777 302.974368
41 2025 6 191.943842 140.091285 255.723631
42 2025 7 175.714952 122.485741 237.394042
43 2025 8 166.907529 116.882975 229.830537
44 2025 9 181.136789 124.492817 260.641381
45 2025 10 197.237487 136.353898 265.805491
46 2025 11 235.411327 176.371779 317.774910
47 2025 12 323.986555 249.000950 419.496738
Saved: /home/hamzeh-khanpour/Desktop/CITIBIK-main/reports/y2019_2020_2023_2025_m1_2_3_4_5_6_7_8_9_10_11_12_modenyc/figures/03_seasonality_lines.png
No description has been provided for this image
In [6]:
# --- Seasonality: heatmap (year × month) ---
if not can_do_seasonality:
    print("Skipped")
else:
    risk_col = (
        "eb_risk_rate_per_100k_trips"
        if "eb_risk_rate_per_100k_trips" in credible.columns
        else "risk_rate_per_100k_trips"
    )

    pivot = credible.groupby(["year", "month"])[risk_col].median().unstack()

    fig, ax = plt.subplots(figsize=(10, 3 + 0.5 * len(pivot)))

    im = ax.imshow(pivot.values, aspect="auto", cmap="YlOrRd")

    ax.set_xticks(range(len(pivot.columns)))
    ax.set_xticklabels([f"{int(m):02d}" for m in pivot.columns])
    ax.set_yticks(range(len(pivot.index)))
    ax.set_yticklabels([str(int(y)) for y in pivot.index])

    ax.set_xlabel("Month", fontsize=12)
    ax.set_ylabel("Year", fontsize=12)
    ax.set_title(
        f"Risk Heatmap (Year × Month) — mode={mode} ({run_label})", fontsize=14, fontweight="bold"
    )

    cbar = plt.colorbar(im, ax=ax, fraction=0.03)
    cbar.set_label("Median EB Risk Rate")

    # Add text labels
    for i in range(len(pivot.index)):
        for j in range(len(pivot.columns)):
            val = pivot.iloc[i, j]
            if not pd.isna(val):
                color = "white" if val > pivot.values[~np.isnan(pivot.values)].mean() else "black"
                ax.text(j, i, f"{val:.0f}", ha="center", va="center", color=color, fontsize=9)

    plt.tight_layout()
    savefig("04_seasonality_heatmap.png")
    plt.show()
Saved: /home/hamzeh-khanpour/Desktop/CITIBIK-main/reports/y2019_2020_2023_2025_m1_2_3_4_5_6_7_8_9_10_11_12_modenyc/figures/04_seasonality_heatmap.png
No description has been provided for this image
In [7]:
# --- Boxplot: risk distribution by year ---
if not can_do_seasonality:
    print("Skipped")
else:
    risk_col = (
        "eb_risk_rate_per_100k_trips"
        if "eb_risk_rate_per_100k_trips" in credible.columns
        else "risk_rate_per_100k_trips"
    )

    years = sorted(credible["year"].unique())
    data_by_year = [credible[credible["year"] == y][risk_col].dropna().values for y in years]

    fig, ax = plt.subplots(figsize=(8, 5))

    bp = ax.boxplot(data_by_year, tick_labels=[str(int(y)) for y in years], showfliers=False)

    for median in bp["medians"]:
        median.set_color("red")
        median.set_linewidth(2)

    ax.set_xlabel("Year", fontsize=12)
    ax.set_ylabel("EB Risk Rate (per 100k trips)", fontsize=12)
    ax.set_title(
        f"Risk Distribution by Year — mode={mode} ({run_label})", fontsize=14, fontweight="bold"
    )
    ax.grid(axis="y", alpha=0.3, linestyle="--")

    from matplotlib.lines import Line2D

    ax.legend([Line2D([0], [0], color="red", linewidth=2)], ["Median"], loc="best")

    plt.tight_layout()
    savefig("05_distribution_boxplot.png")
    plt.show()
Saved: /home/hamzeh-khanpour/Desktop/CITIBIK-main/reports/y2019_2020_2023_2025_m1_2_3_4_5_6_7_8_9_10_11_12_modenyc/figures/05_distribution_boxplot.png
No description has been provided for this image
In [8]:
# --- Slopegraph: station-level change between years ---
if not can_do_seasonality:
    print("Skipped")
else:
    id_col = "start_station_id" if "start_station_id" in credible.columns else None
    risk_col = (
        "eb_risk_rate_per_100k_trips"
        if "eb_risk_rate_per_100k_trips" in credible.columns
        else "risk_rate_per_100k_trips"
    )
    exp_col = "exposure_trips" if "exposure_trips" in credible.columns else "trips"

    if id_col is None:
        print("No station ID column — skipping slopegraph")
    else:
        years = sorted(credible["year"].unique())

        if len(years) < 2:
            print("Need at least 2 years for slopegraph")
        else:
            y0, y1 = min(years), max(years)

            # Aggregate: weighted average across months
            def weighted_avg(group):
                weights = group[exp_col].values
                values = group[risk_col].values
                if weights.sum() == 0:
                    return np.nan
                return np.average(values, weights=weights)

            agg = credible.groupby([id_col, "year"]).apply(weighted_avg).reset_index()
            agg.columns = [id_col, "year", "risk"]

            df0 = agg[agg["year"] == y0][[id_col, "risk"]].rename(columns={"risk": "risk0"})
            df1 = agg[agg["year"] == y1][[id_col, "risk"]].rename(columns={"risk": "risk1"})

            merged = df0.merge(df1, on=id_col).dropna()

            if len(merged) == 0:
                print(f"No stations present in both {y0} and {y1}")
            else:
                merged = merged.head(100)  # Limit for readability

                fig, ax = plt.subplots(figsize=(8, 6))

                for _, row in merged.iterrows():
                    ax.plot(
                        [0, 1], [row["risk0"], row["risk1"]], alpha=0.2, color="gray", linewidth=1
                    )

                ax.plot(
                    [0, 1],
                    [merged["risk0"].median(), merged["risk1"].median()],
                    color="red",
                    linewidth=3,
                    label="Median",
                )

                ax.set_xticks([0, 1])
                ax.set_xticklabels([str(y0), str(y1)])
                ax.set_ylabel("EB Risk Rate (per 100k trips)", fontsize=12)
                ax.set_title(
                    f"Station-Level Risk Change — mode={mode} ({run_label})",
                    fontsize=14,
                    fontweight="bold",
                )
                ax.legend()
                ax.grid(True, alpha=0.3, linestyle="--")

                plt.tight_layout()
                savefig("06_station_change_slopegraph.png")
                plt.show()

                improved = (merged["risk1"] < merged["risk0"]).sum()
                worsened = (merged["risk1"] > merged["risk0"]).sum()
                print(f"\nStations improved: {improved}")
                print(f"Stations worsened: {worsened}")
No stations present in both 2019 and 2025

/tmp/ipykernel_5664/1551659767.py:31: FutureWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
  agg = credible.groupby([id_col, "year"]).apply(weighted_avg).reset_index()
In [9]:
# --- Summary ---
print("=" * 60)
print(f"RISK ANALYSIS SUMMARY — mode={mode} ({run_label})")
print("=" * 60)

if df_score is None:
    print("No scorecard data available")
else:
    exp_col = "exposure_trips" if "exposure_trips" in df_score.columns else "trips"
    risk_col = (
        "eb_risk_rate_per_100k_trips"
        if "eb_risk_rate_per_100k_trips" in df_score.columns
        else "risk_rate_per_100k_trips"
    )

    print(f"\nTotal rows in scorecard: {len(df_score):,}")

    if "credibility_flag" in df_score.columns:
        n_credible = (df_score["credibility_flag"] == "credible").sum()
        print(f"Credible station-periods: {n_credible:,}")

    print(f"\nExposure (trips):")
    print(f"  Total: {df_score[exp_col].sum():,.0f}")
    print(f"  Median per station-period: {df_score[exp_col].median():,.0f}")

    print(f"\nRisk ({risk_col}):")
    print(f"  Mean: {df_score[risk_col].mean():.2f}")
    print(f"  Median: {df_score[risk_col].median():.2f}")
    print(f"  Std: {df_score[risk_col].std():.2f}")

    if "prevention_hotspot" in df_score.columns:
        n_hotspots = df_score["prevention_hotspot"].sum()
        print(f"\nPrevention hotspots: {n_hotspots:,}")

    if "eb_m_prior_used" in df_score.columns:
        m_val = df_score["eb_m_prior_used"].iloc[0]
        print(f"\nEB prior strength (m): {m_val:,.0f}")

print("\n" + "=" * 60)
============================================================
RISK ANALYSIS SUMMARY — mode=nyc (y2019_2020_2023_2025_m1_2_3_4_5_6_7_8_9_10_11_12_modenyc)
============================================================

Total rows in scorecard: 72,466
Credible station-periods: 16,454

Exposure (trips):
  Total: 241,616,968
  Median per station-period: 1,721

Risk (eb_risk_rate_per_100k_trips):
  Mean: 954.59
  Median: 773.83
  Std: 761.23

Prevention hotspots: 1,836

EB prior strength (m): 1,000

============================================================